# Directory structure
github_dir <- file.path("G:/Shared drives/Nord Lab - Computational Projects/MIA/Canales_et_al_eLIFE_DE_repo/Canales_eLIFE_2021_DE")

setwd(github_dir)


# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE, 
                      warning = FALSE, 
                      error=FALSE, 
                      echo=TRUE, 
                      cache=TRUE, 
                      fig.width = 7, fig.height = 6, 
                      fig.align = 'left')

# R packages
library(sva)
library(RUVnormalize)
library(pheatmap)
library(edgeR)
library(GenomicFeatures)
library(mclust)
library(parallel)
library(ggplot2)
library(reshape2)
library(dplyr)
library(data.table)
library(DT)
library(RColorBrewer)
library(knitr)
library(ggrepel)
library(gridExtra)
library(grid)
library(plyr)
library(plotly)
library(Hmisc)
# Reads input files

setwd(github_dir)

exp.data <- read.csv("./Extra_submission files/Gene_counts.csv")
rownames(exp.data) <- exp.data$X
exp.data$X <- NULL

sample.info <- read.csv("./Extra_submission files/metadata.csv")

load("./Extra_submission files/exonic.gene.sizes")

# Exonic gene sizes were calculated as follows:
# Generate mm9 list of exon sizes 
# Method description at: https://www.biostars.org/p/83901/

#txdb <- makeTxDbFromGFF("G:/Shared drives/Nord Lab - Personal #Folders/Karol/Genomes/Mus_musculus/UCSC/mm9/Annotation/genes.gtf", #format="gtf")
#exons.list.per.gene <- exonsBy(txdb,by="gene")

# I paralelized this increasing the speed of the process >2x on a 4-core (logical) machine. 
# Use mclapply instead of parLapply if you use a Mac.
#cl <- makeCluster(detectCores())
#exonic.gene.sizes <- parallel::parLapply(cl, #exons.list.per.gene,function(x){sum(width(reduce(x)))})
#stopCluster(cl)

1. Metadata

sample.info_submission <- read.csv("Extra_submission files/Supplementary Table 5, RNA-seq sample metadata_03_02_2021.csv")

datatable(sample.info_submission, 
          rownames = FALSE, 
          filter = list(position = 'top', clear = FALSE, plain = FALSE), 
          options = list(paging = FALSE, scrollX=T, scrollY="300px", searching = FALSE))

2. Count data quality control

2.1 Identification of sample sex

# Qualifies F or M based on the Xist expression
sex.by.rna <- c(ifelse(exp.data["Xist",]>1000,"F","M")) #

Xist_exp <- as.data.frame(reshape2::melt(exp.data["Xist",]))
Xist_exp <- cbind(Xist_exp, sex.by.rna)
Xist_exp <- arrange(Xist_exp, value)
colnames(Xist_exp) <- c("variable", "counts", "sex.by.rna")

ggplot(Xist_exp, aes(x=sex.by.rna, y=counts, colours=sex.by.rna))+
  geom_jitter()+
  geom_boxplot(alpha=0.2)+
  theme_bw()+
  labs(title="Xist read counts", x = "", y = "Read counts")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  scale_x_discrete(breaks= c("F", "M"), labels=c("Females", "Males"))
Sample sex assignment based on *Xist* gene read count. Samples with less than 1000 *Xist* reads per sample are males, and samples with more than 1000 *Xist* reads are females.

Sample sex assignment based on Xist gene read count. Samples with less than 1000 Xist reads per sample are males, and samples with more than 1000 Xist reads are females.

2.1 Filtering genes with very low expression

# Calculates RPKM values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))

rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)
rpkm.data_linear_all <- rpkm.data_linear

#write.csv(rpkm.data, file = "GEO_submission #files/rpkm_log2_24015.csv")
#write.csv(rpkm.data_linear, file = "GEO_submission #files/rpkm_linear_24015.csv")


### Removes genes with low expression ###

# Lets plot a histogram of all rpkm values in a dataset.
#dim(rpkm.data) #24015 rows and 74 columns

df <- reshape2::melt(rpkm.data)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("gene_name", "sample", "rpkm_log2","rpkm_linear")

# Histogram of all rpkm values in a dataset.
ggplot(df, aes(x=rpkm_log2))+ 
  geom_histogram(bins = 1000, color="black")+
  theme_bw()+
  labs(title="Unfiltered dataset of 24015 genes", 
       x="log2 RPKM", y = "Read counts")+
  theme(plot.title = element_text(hjust = 0.5))
Histograms of RPKM gene expression values before and after excluding genes with very low expression.

Histograms of RPKM gene expression values before and after excluding genes with very low expression.

# Setting the threshold for minimum rpkm value in at least 2 samples. 
threshold = -2

# Nymber of genes after filtering
#sum(reshape2::melt(rowSums(rpkm.data > threshold) >= 2)$value) # 17051

# Let's filter the dataset and plot the histogram distribution
keep <- as.data.frame(reshape2::melt(rowSums(rpkm.data > threshold) >= 2))
keep$gene_name <- rownames(keep)
keep <- filter(keep, value == "TRUE")$gene_name

# NOTE: The set of included genes reported in the manuscript was determined using expression criteria from an expanded set of samples including a condition that was not used for the manuscript. The difference between the gene counts is 17195 in the manuscript from the larger set of samples, compared to 17051 genes that would pass in the set of samples used in the MIA vs. control comparison.  Note that there is no difference among DEG passing FDR < 0.05 using either gene set.  

original_exp.data <- read.csv("Extra_submission files/Count_gene_set_from_initial_submission.csv")

# length(keep) # 17051
# length(original_exp.data$gene_name) # 17195

keep <- original_exp.data$gene_name

# 
datExpr_test <- as.data.frame(rpkm.data)
datExpr_test$gene_name <- rownames(datExpr_test)

# Filters the "keep" genes 
datExpr_test <- filter(datExpr_test, gene_name %in% keep)

# Generates a matrix-type data frame
rownames(datExpr_test) <- datExpr_test$gene_name
datExpr_test$gene_name <- NULL

# Overwrites the original datExpr object.
datExpr <- datExpr_test


# Plots the histogram after filtering
df <- reshape2::melt(datExpr)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("sample", "rpkm_log2","rpkm_linear")

ggplot(df, aes(x=rpkm_log2))+
  geom_histogram(bins = 1000, color="black")+
  theme_bw()+
  labs(title = "Dataset of 17195 genes \n after excluding genes expressed at a very low level", x="log2 RPKM", y = "Read counts")+
  theme(plot.title = element_text(hjust = 0.5))
Histograms of RPKM gene expression values before and after excluding genes with very low expression.

Histograms of RPKM gene expression values before and after excluding genes with very low expression.

# Removing low expression genes. Overwrites exp.data object used for DE analysis ### 
#dim(exp.data)  #exp.data contains counts, datExpr contains rpkms 

exp.data$gene_name <- rownames(exp.data)
exp.data <- filter(exp.data, gene_name %in% rownames(datExpr))

rownames(exp.data) <- exp.data$gene_name
exp.data$gene_name <- NULL 


# Recalculates rpkm values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))

# RPKM calculation
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)


#write.csv(rpkm.data, file = "GEO_submission #files/rpkm_log2_17051.csv")
#write.csv(rpkm.data_linear, file = "GEO_submission #files/rpkm_linear_17051.csv")

2.2 PCA plots

setwd(github_dir)

group <- ifelse(sample.info[,"Condition"]=="Saline",1,2)
rRNA <- as.numeric(sample.info$rRNA)
sex.by.rna <- factor(sample.info$sex.by.rna)
dpc <- factor(sample.info$DPC)
response <- factor(sample.info$Response)
lane <- factor(sample.info$Lane)

source("Dimensionality reduction plots.r")

grid.arrange(PCA_plot, PCA_plot_sex, ncol = 2)
**Figure 1bc.** Principal component analysis (PCA) of RNA-seq data indicates that developmental age accounts for the majority of variance across samples. Age (DPC) or sex are represented as colored symbols in (b - left) and (c - right) respectively, and poly(I:C) or saline treatment indicated by circles or triangles, respectively

Figure 1bc. Principal component analysis (PCA) of RNA-seq data indicates that developmental age accounts for the majority of variance across samples. Age (DPC) or sex are represented as colored symbols in (b - left) and (c - right) respectively, and poly(I:C) or saline treatment indicated by circles or triangles, respectively

3. Developmental time prediction model

# 19.5 was used as a numerical representation of the P0 time point.

test.dpc <- as.factor(sample.info$DPC)
test.sex.by.rna <- as.factor(sample.info$sex_by_rna)
test.response <- as.factor(sample.info$Response)
test.rRNA <- sample.info$rRNA
test.group <- as.factor(sample.info$Condition)
test.lane <- as.factor(sample.info$Lane)
test.data <- exp.data

min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria

#qCML expression modeling
y <- DGEList(counts=test.data, group=test.group)
keep <- rowSums(cpm(y)>min.cpm) >=2
y <- y[keep, , keep.lib.sizes=FALSE]
## Perform simple exact test on group
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)


test.pseudo <- y$pseudo.counts
pca.results <- prcomp(test.pseudo)

#PCA dataframe does not contain all samples as the exp.data, so I have to find which datapoints are control and polyic now
control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")

train.data <- data.frame(PCA.1=pca.results$rotation[control.datapoints,1],
                         PCA.2=pca.results$rotation[control.datapoints,2],
                         PCA.3=pca.results$rotation[control.datapoints,3],
                         PCA.4=pca.results$rotation[control.datapoints,4],
                         PCA.5=pca.results$rotation[control.datapoints,5],
                         rRNA=as.numeric(test.rRNA[control.datapoints]),
                         sex=sex.by.rna[control.datapoints],
                         lane=test.lane[control.datapoints],
                         response=test.response[control.datapoints]
                         
)

predict.data <- data.frame(PCA.1=pca.results$rotation[polyic.datapoints,1],
                         PCA.2=pca.results$rotation[polyic.datapoints,2],
                         PCA.3=pca.results$rotation[polyic.datapoints,3],
                         PCA.4=pca.results$rotation[polyic.datapoints,4],
                         PCA.5=pca.results$rotation[polyic.datapoints,5],
                         rRNA=as.numeric(test.rRNA[polyic.datapoints]),
                         sex=sex.by.rna[polyic.datapoints],
                         lane=test.lane[polyic.datapoints],
                         response=test.response[polyic.datapoints]
                         
)

lm.model <- lm(as.numeric(as.character(test.dpc[control.datapoints])) ~
                 PCA.1 + PCA.2 + PCA.3 + PCA.4 + PCA.5 + sex + lane,
                 data = train.data
)


# Predicts DPC from the linear model built on train data.
predict.saline <- predict(lm.model, newdata = train.data)
predict.polyic <- predict(lm.model, newdata = predict.data)

# I had some trouble with the original for loop and I replaced it with lapply. I also added the sd values and a plot representing modeled DPC values.
dpc.predict.mean.saline <- lapply(1:4, function(x) FUN=mean(predict.saline[which(as.character(test.dpc[control.datapoints])==unique(test.dpc)[x])]))

dpc.predict.sd.saline <- lapply(1:4, function(x) FUN=sd(predict.saline[which(as.character(test.dpc[control.datapoints])==unique(test.dpc)[x])]))

dpc.predict.mean.polyic <- lapply(1:4, function(x) FUN=mean(predict.polyic[which(as.character(test.dpc[polyic.datapoints])==unique(test.dpc)[x])]))

dpc.predict.sd.polyic <- lapply(1:4, function(x) FUN=sd(predict.polyic[which(as.character(test.dpc[polyic.datapoints])==unique(test.dpc)[x])]))

dpc.predict.mean.saline <- as.numeric(dpc.predict.mean.saline)
dpc.predict.sd.saline <- as.numeric(dpc.predict.sd.saline)
dpc.predict.mean.polyic <- as.numeric(dpc.predict.mean.polyic)
dpc.predict.sd.polyic <- as.numeric(dpc.predict.sd.polyic)


lm.predicted.dpc <- data.frame(dpc.predict.mean.saline, dpc.predict.sd.saline, dpc.predict.mean.polyic, dpc.predict.sd.polyic)


####
df_1 <- data.frame("Actual_DPC" = as.numeric(as.character(test.dpc[control.datapoints])),
                   "Predicted_DPC" = predict.saline, "Condition" = rep("Saline", length(predict.saline)))


df_2 <- data.frame("Actual_DPC" = as.numeric(as.character(test.dpc[polyic.datapoints])),
                   "Predicted_DPC" = predict.polyic, "Condition" = rep("Poly(I:C)", length(predict.polyic)))

df_combined <- rbind(df_1, df_2)


mean_data <- group_by(df_combined, Actual_DPC, Condition) %>%
             summarise(average_predicted_dpc = mean(Predicted_DPC, na.rm = TRUE),
                       sd_predicted_dpc = sd(Predicted_DPC, na.rm = TRUE)
                       )

mean_data <- as.data.frame(mean_data)

j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]


df_combined$Condition <- ifelse(df_combined$Condition == "Saline", "Control", as.character(df_combined$Condition))
mean_data$Condition <- ifelse(mean_data$Condition == "Saline", "Control", as.character(mean_data$Condition))


Fig_4c <- ggplot()+
  geom_jitter(data=df_combined, aes(x = Actual_DPC, y = Predicted_DPC,  color=Condition), size = 2, height= 0.3, alpha=0.7)+
  geom_line(data=mean_data, aes(x = Actual_DPC, y = average_predicted_dpc, color=Condition), size=1.5, alpha=0.7)+
  theme_bw()+
  scale_color_manual(values=c("Control"=j_brew_colors[2], "Poly(I:C)" = j_brew_colors[1]))+
  scale_x_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
  scale_y_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
  labs(y = "Predicted DPC", x = "Actual DPC")+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

Fig_4c
**Figure 4c.** Temporal modeling of the RNA-seq data suggests acceleration of the neurodevelopmental program in MIA animals at E17.5 (P = 4.5e-06, Student’s t-test) and similar trend at E14.5 (P = 0.07, Student’s t-test). Actual age (X-axis) vs predicted age (Y-axis) calculated by the linear model. Control samples were used for training the model. Lines connect average values, points depict individual samples and are jittered along the X-axis.

Figure 4c. Temporal modeling of the RNA-seq data suggests acceleration of the neurodevelopmental program in MIA animals at E17.5 (P = 4.5e-06, Student’s t-test) and similar trend at E14.5 (P = 0.07, Student’s t-test). Actual age (X-axis) vs predicted age (Y-axis) calculated by the linear model. Control samples were used for training the model. Lines connect average values, points depict individual samples and are jittered along the X-axis.

3.1 Statistical analysis

#Nicer dataframe
predicted.dpc.df <- data.frame("Real_DPC"=c(12.5, 14.5, 17.5, 19.5), "Predicted_Saline_Mean_DPC"= dpc.predict.mean.saline, "Predicted_Saline_SD_DPC"=dpc.predict.sd.saline, "Predicted_PolyIC_Mean_DPC"= dpc.predict.mean.polyic, "Predicted_PolyIC_SD_DPC"=dpc.predict.sd.polyic)

##### T test ####
t.test.12 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="12.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="12.5")])

t.test.14 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="14.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="14.5")])

t.test.17 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="17.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="17.5")])

t.test.19 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="19.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="19.5")])

t.test.modeled.DPC.p.val <- data.frame("p-value E12.5"=t.test.12$p.value, "p-value E14.5"=t.test.14$p.value, "p-value E17.5"=t.test.17$p.value, "p-value E19.5"=t.test.19$p.value)

predicted_DPC_df <- reshape2::melt(data.frame("mean E12.5"=t.test.12$estimate, "mean E14.5"=t.test.14$estimate, "mean E17.5"=t.test.17$estimate, "mean E19.5"=t.test.19$estimate))


t_test_p_value=c(t.test.12$p.value, t.test.14$p.value, t.test.17$p.value, t.test.19$p.value)
specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))

predicted_DPC_df_nice <- data.frame("Real_DPC"= c(12.5, 14.5, 17.5, 19.5), "Predicted_DPC_Saline"=predicted_DPC_df$value[c(1,3,5,7)], "Predicted_DPC_PolyIC"=predicted_DPC_df$value[c(2,4,6,8)], "t_test_p_value"=specify_decimal(t_test_p_value, 8))

knitr::kable(predicted_DPC_df_nice)
Real_DPC Predicted_DPC_Saline Predicted_DPC_PolyIC t_test_p_value
12.5 12.83713 12.69296 0.77760404
14.5 14.62900 15.61387 0.08843077
17.5 16.80196 19.37435 0.00005209
19.5 19.51513 19.59144 0.70087733

4. Differential gene expression

single_timepoint_glm_function<- function(x){

control.datapoints <- intersect(control.datapoints, which(dpc == x))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))

use.cols <- c(control.datapoints, polyic.datapoints)

test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]

min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria

design <- model.matrix(~as.factor(test.lane) + as.factor(test.sex.by.rna) + as.numeric(test.group))
y <- DGEList(counts=test.data, group=group[use.cols])
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group 
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design) 
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.

glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table

glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]

glm.output.full

}

E12.5_GLM <- single_timepoint_glm_function(12.5)
E14.5_GLM <- single_timepoint_glm_function(14.5)
E17.5_GLM <- single_timepoint_glm_function(17.5)
E19.5_GLM <- single_timepoint_glm_function(19.5)

4.1 Numbers of DE genes

E12.5_up_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC > 0))
E12.5_down_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC < 0))

E14.5_up_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC > 0))
E14.5_down_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC < 0))

E17.5_up_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC > 0))
E17.5_down_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC < 0))

E19.5_up_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC > 0))
E19.5_down_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC < 0))

###

E12.5_up_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC > 0))
E12.5_down_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC < 0))

E14.5_up_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC > 0))
E14.5_down_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC < 0))

E17.5_up_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC > 0))
E17.5_down_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC < 0))

E19.5_up_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC > 0))
E19.5_down_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC < 0))


DE_df_n <- t(data.frame("E12.5" =c(E12.5_up_n, E12.5_down_n, E12.5_up_n_FDR, E12.5_down_n_FDR),
           "E14.5" =c(E14.5_up_n, E14.5_down_n, E14.5_up_n_FDR, E14.5_down_n_FDR),
           "E17.5" =c(E17.5_up_n, E17.5_down_n, E17.5_up_n_FDR, E17.5_down_n_FDR),
           "P0" =c(E19.5_up_n, E19.5_down_n, E19.5_up_n_FDR, E19.5_down_n_FDR)))

colnames(DE_df_n) <- c("Upregulated", "Downregulated", "Upregulated", "Downregulated")


DE_df_n_melted <- reshape2::melt(DE_df_n)

DE_df_n_melted$stat <- c(rep(", P < 0.05", 8), rep(", FDR < 0.05", 8))

DE_df_n_melted$Var2_stat <- paste(DE_df_n_melted$Var2, DE_df_n_melted$stat)


Fig1f <- ggplot(DE_df_n_melted, aes(fill=Var2_stat, group=Var2, x=Var1, y=value))+
  geom_bar(position = "dodge",  stat="identity", color="black")+
  theme_bw()+
  #scale_fill_manual(values=brewer.pal(n = 8, name = "Paired")[c(6,6,2,2 )])+
  scale_fill_manual(values=c("#1F78B4", "#62a0ca", "#E31A1C", "#eb5e60"))+
  theme(legend.title=element_blank())+
  labs(title="Number of DE genes with P < 0.05 and FDR < 0.05", x="", y="")+
  theme(plot.title = element_text(size = rel(1.2), hjust=0.5))+
  geom_text(aes(label=value), position=position_dodge(width=0.9), hjust=-0.2)+
  coord_flip()+
  scale_x_discrete(limits = rev(levels(DE_df_n_melted$Var1)))+
  theme(panel.border = element_blank(),
        #legend.key = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(angle = 90, size = 14, face = "bold", 
                                   hjust = 0.5, vjust = -4),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  theme(legend.position="bottom")

Fig1f
**Figure 1f.** Numbers of upregulated and downregulated DE genes at FDR < 0.05 and P < 0.05 thresholds show varying DE gene numbers with a peak of dysregulated genes at E17.5.

Figure 1f. Numbers of upregulated and downregulated DE genes at FDR < 0.05 and P < 0.05 thresholds show varying DE gene numbers with a peak of dysregulated genes at E17.5.

4.2 Interactive volcano plots and DE tables

To improve the performance of interactive volcano plots, only genes with P < 0.05 are displayed. Tables contain full DE data sets.

volcano_plot_text_2 <- function(x, title) {

x <- dplyr::filter(x, PValue <0.05)
  
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)

plotDat <- data.frame(x, Group=test)

plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)


p <- ggplot(plotDat, aes(x = logFC, y=-log10(PValue), fill=Group, col = Group)) +
  geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
  theme_light()+
 scale_fill_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
   scale_color_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
  labs(title= title, y="", x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  theme(legend.text=element_text(size=16))+
  theme(axis.text=element_text(size=14))+
  theme(legend.title=element_blank())+
  theme(axis.text=element_text(size=14))+
  theme(axis.title = element_text(size=14, face = "bold"))+
  theme(legend.position="bottom")+
  coord_cartesian(xlim = c(-4.2, 4.2))+
  geom_hline(yintercept = -log10(0.05), linetype=2)+
   scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

ggplotly(p) %>%
  layout(legend = list(
      orientation = "v",
      y = -0.1, 
      font=list(
            size=14
        )
    )
  )
}


#################  Version capped at different y axis levels ###############

volcano_plot_text_3 <- function(x, title) {

x <- dplyr::filter(x, PValue <0.05)
  
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)


plotDat <- data.frame(x, Group=test)

plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)

plotDat$log10_PValue <- -log10(plotDat$PValue)

plotDat$log10_PValue <- ifelse(plotDat$log10_PValue > 8, 8, plotDat$log10_PValue)

#scale_fill_manual(values=c("#1F78B4" - blue, "#62a0ca" - lighter blue, "#E31A1C" - red, "#eb5e60" - lighter red))+

p <- ggplot(plotDat, aes(x = logFC, y=log10_PValue, fill=Group, col = Group)) +
  geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
  theme_light()+
  scale_fill_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
   scale_color_manual(values=c("Non significant"="grey30", 
                             "PValue < 0.05 & logFC > 0"="#eb5e60", 
                             "PValue < 0.05 & logFC < 0"="#62a0ca", 
                             "FDR < 0.05 & logFC > 0" = "#960304", 
                             "FDR < 0.05 & logFC < 0" = "#01538a"))+
  labs(title= title, y="", x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  theme(legend.text=element_text(size=16))+
  theme(axis.text=element_text(size=14))+
  theme(legend.title=element_blank())+
  theme(axis.text=element_text(size=14))+
  theme(axis.title = element_text(size=14, face = "bold"))+
  theme(legend.position="bottom")+
  coord_cartesian(xlim = c(-4.2, 4.2))+
  expand_limits(x = c(-4.2, 4.2))+
  coord_cartesian(ylim = c(0, 8))+
  geom_hline(yintercept = -log10(0.05), linetype=2)+
  scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
  scale_y_continuous(breaks=c(0, 2, 4, 6, 8), labels=c("0","2", "4", "6", ">8"))+
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())


ggplotly(p) %>%
  layout(legend = list(
      orientation = "v",
      y = -0.1, 
      font=list(
            size=14
        )
    )
  )
}

4.2.1 E12.5

p <- volcano_plot_text_3(E12.5_GLM, "E12.5")
p

Figure 1e.

datatable(E12.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

4.2.2 E14.5

p <- volcano_plot_text_3(E14.5_GLM, "E14.5")
p

Figure 1e.

datatable(E14.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

4.2.3 E17.5

p <- volcano_plot_text_2(E17.5_GLM, "E17.5")
p

Figure 1e.

datatable(E17.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

4.2.4 P0

p <- volcano_plot_text_3(E19.5_GLM, "P0")
p

Figure 1e.

datatable(E19.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))

4.3 Heatmap of DE genes

#Build dataframe of logFC values
padding <- 0
a <- data.frame("gene_name"=E12.5_GLM$gene_name, "E12.5_logFC"=E12.5_GLM$logFC+padding)
b <- data.frame("gene_name"=E14.5_GLM$gene_name, "E14.5_logFC"=E14.5_GLM$logFC+padding)
c <- data.frame("gene_name"=E17.5_GLM$gene_name, "E17.5_logFC"=E17.5_GLM$logFC+padding)
d <- data.frame("gene_name"=E19.5_GLM$gene_name, "E19.5_logFC"=E19.5_GLM$logFC+padding)

e <- merge(a, b, by="gene_name")
f <- merge(c,d, by="gene_name")
df <- merge(e,f, by="gene_name")




FDR_threshold <- 0.05
logFC_threshold <- 1

a <- dplyr::filter(E12.5_GLM, FDR < FDR_threshold & 
              logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
b <- dplyr::filter(E14.5_GLM, FDR < FDR_threshold & 
              logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
c <- dplyr::filter(E17.5_GLM, FDR < FDR_threshold & 
              logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
d <- dplyr::filter(E19.5_GLM, FDR < FDR_threshold & 
              logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name

DE_genes_logFC_1_20_top <- unique(c(a, b, c, d))

#length(DE_genes_logFC_1_20_top)

#Filter DE data frame
df_1 <- dplyr::filter(df, gene_name %in% DE_genes_logFC_1_20_top)

rownames(df_1) <- df_1$gene_name
df_1$gene_name <- NULL

colnames(df_1) <- c("E12.5", "E14.5", "E17.5", "P0")

rownames(df_1) <- NULL
df_1 <- df_1[,c(1:4)]

paletteLength <- 100
test <- as.matrix(df_1)

myBreaks <- c(seq(min(test), 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(max(test)/paletteLength, max(test), length.out=floor(paletteLength/2)))

myColor <- colorRampPalette(c("#9E0041", "#fea11d","#fea11d", "white", "#43abea", "#0a48aa", "#13008e"))(paletteLength)


Fig_1d <- pheatmap(as.matrix(df_1), 
         clustering_distance_rows="correlation", 
         cluster_rows = T, 
         cluster_cols = F, 
         legend = T,
         angle_col = 0,
         legend_breaks = c(-4, -2, 0, 2, 4, max(df_1)),
         legend_labels = c("-4", "-2", "0", "2", "4","log2FC"),
         fontsize_row  = gene_name_font_size, 
         color = rev(myColor), 
         breaks = myBreaks,
         main = bquote(atop("Heatmap of DE genes",
                       "FDR < 0.05 and log" [2] *"FC >1 or <-1")))
        

Fig_1d
**Figure 1d.** Heatmap representing relative gene expression changes between control and MIA samples across time-points. Hierarchical clustering by relative fold changes shows stage-specific differential expression signatures (DE genes shown have FDR < 0.05 and log2 fold change (log2FC) > 1 or < -1).

Figure 1d. Heatmap representing relative gene expression changes between control and MIA samples across time-points. Hierarchical clustering by relative fold changes shows stage-specific differential expression signatures (DE genes shown have FDR < 0.05 and log2 fold change (log2FC) > 1 or < -1).

5. SFARI gene enrichment among DE genes

SFARI_genes <- read.csv("./Extra_submission files/SFARI-Gene_genes_01-15-2019release_02-26-2019export.csv")

#head(SFARI_genes)

#We are using only high confidence genes associated with autism
SFARI_genes_filtered <- filter(SFARI_genes, gene.score <3)


require("biomaRt")

#Function translating human to mouse orthologs
humanToMouse <- function(x){
  
  human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
  mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
  
  # comparison between mouse and human, returns mouse gene equivalents
  genes = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
  
  mousex <- unique(genes[, 2])
  return(mousex)
}

#Returning a df matching orthologs
humanToMouse_2 <- function(x){
  
  human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
  mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
  
  # comparison between mouse and human, returns mouse gene equivalents
  genes = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
  
  mousex <- genes
  return(mousex)
}

#Translating orthologs
SFARI_genes_mouse <- humanToMouse_2(SFARI_genes$gene.symbol)
#SFARI_genes_mouse

SFARI_genes_with_ortho <- merge(SFARI_genes, SFARI_genes_mouse, by.x= "gene.symbol", by.y = "HGNC.symbol")

#making sure still have all the 87 SFARI genes with gene.score 1 and 2
x <- filter(SFARI_genes_with_ortho, gene.score %in% c(1,2))[,c(1,9)]


SFARI_df_to_save <- filter(SFARI_genes_with_ortho, gene.score %in% c(1,2))[ ,c(1, 3:9)]
colnames(SFARI_df_to_save)[8] <- "MGI.symbol_ortholog"

#write.csv(SFARI_df_to_save, file="Supplementary Table 1, #High_confidence_SFARI_mouse_orthologs.csv",row.names = F)

#length(unique(x$gene.symbol)) # 87 Human SFARI genes
#length(unique(x$MGI.symbol))  # 89 mouse orthologs;  MSNP1AS doesn't have an ortholog; DDX3X has 2 orthologs, SRCAP has 2 orthologs too. 

#setdiff(SFARI_genes_filtered$gene.symbol, SFARI_genes_with_ortho$gene.symbol)

#### Circular enrichment plot and permutation test enrichment analysis#### 
`%notin%` <- Negate(`%in%`)

SFARI_DE_df_annotation <- function(x){

cat_1 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 1)$MGI.symbol)
cat_2 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 2)$MGI.symbol)
cat_3 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 3)$MGI.symbol)
cat_4 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 4)$MGI.symbol)
cat_5 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 5)$MGI.symbol)
cat_6 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 6)$MGI.symbol)
cat_NA <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == "NA")$MGI.symbol)
cat_Non_Ausitm <- filter(x, gene_name %notin% filter(SFARI_genes_with_ortho, gene.score %in% c(1, 2,3,4,5,6, "NA"))$MGI.symbol)

cat_1$SFARI_category <- rep(1, nrow(cat_1))
cat_2$SFARI_category <- rep(2, nrow(cat_2))
cat_3$SFARI_category <- rep(3, nrow(cat_3))
cat_4$SFARI_category <- rep(4, nrow(cat_4))
cat_5$SFARI_category <- rep(5, nrow(cat_5))
cat_6$SFARI_category <- rep(6, nrow(cat_6))
cat_NA$SFARI_category <- rep(NA, nrow(cat_NA))
cat_Non_Ausitm$SFARI_category <- rep("Non_Ausitm", nrow(cat_Non_Ausitm))

test <- rbind(cat_1, cat_2, cat_3, cat_4, cat_5, cat_6, cat_NA, cat_Non_Ausitm)

test$Up_or_Down <- ifelse(test$logFC > 0, "Upregulated", "Downregulated")

test

}

E12.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E12.5_GLM)
E14.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E14.5_GLM)
E17.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E17.5_GLM)
E19.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E19.5_GLM)


E12.5_GLM_SFARI_cat$DPC <- rep("E12.5", nrow(E12.5_GLM_SFARI_cat))
E14.5_GLM_SFARI_cat$DPC <- rep("E14.5", nrow(E14.5_GLM_SFARI_cat))
E17.5_GLM_SFARI_cat$DPC <- rep("E17.5", nrow(E17.5_GLM_SFARI_cat))
E19.5_GLM_SFARI_cat$DPC <- rep("E19.5", nrow(E19.5_GLM_SFARI_cat))

#Checking the number of SFARI genes with gene.score 1 or 2 at each timepoint
#filter(E12.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E14.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E17.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E19.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes


SFARI_genes_filtered_mouse <- x$MGI.symbol

#Double checking how many of the SFARI orthologs overlap with genes in DE analysis
#length(unique(SFARI_genes_filtered_mouse)) #There are 89 mouse orthologs of gene.score 1 and 2 SFARI genes

#filter(E12.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E14.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E17.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E19.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#The missing 7 genes are just not sufficiently expressed. 


##### Stacked circular bar plot - the one in figure 2 #####
df <- rbind(E12.5_GLM_SFARI_cat, E14.5_GLM_SFARI_cat, E17.5_GLM_SFARI_cat, E19.5_GLM_SFARI_cat)

# Filtering SFARI genes with gene.score 1 and 2
df_test <- filter(df, SFARI_category %in% c(1,2))


#I'm replacing the real values of logFC to 1 and -1 to build a stacked bar plot
df_test$logFC <- ifelse(df_test$logFC > 0, 1, -1)

#Annotates if a gene is significant
df_test$Sig <- as.character(ifelse(df_test$PValue < 0.05, "DE", "Non-DE"))

#A combination fo DPC and significance
df_test$DPC_Sig <- paste(df_test$DPC, df_test$Sig, sep="_")

my_col <- brewer.pal(n = 8, name = "Set1")

#head(df_test)


#Add Up or down
 df_test$dir <- ifelse(df_test$logFC > 0, "Up", "Down")  # Redundant
 df_test$DPC_Sig_dir <- paste(df_test$DPC, df_test$Sig, df_test$dir, sep = "_")

 df_test <- arrange(df_test, gene_name)

#Yes, I checked the gene names are the same in the columns
#all(filter(df_test, DPC == "E12.5")$gene_name == filter(df_test, DPC == "E14.5")$gene_name)
#all(filter(df_test, DPC == "E17.5")$gene_name == filter(df_test, DPC == "E19.5")$gene_name)
#all(filter(df_test, DPC == "E12.5")$gene_name == filter(df_test, DPC == "E17.5")$gene_name)

## A better matrix for clustering with 0 of non-DE logFC values
df_test$logFC_sig <- ifelse(df_test$PValue > 0.05, 0, df_test$logFC)
  
df_matrix <- data.frame("gene_name" = filter(df_test, DPC == "E12.5")$gene_name,
           "E12.5" = filter(df_test, DPC == "E12.5")$logFC_sig,
           "E14.5" = filter(df_test, DPC == "E14.5")$logFC_sig,
           "E17.5" = filter(df_test, DPC == "E17.5")$logFC_sig,
           "E19.5" = filter(df_test, DPC == "E19.5")$logFC_sig)  

#head(df_matrix)

rownames(df_matrix) <- df_matrix$gene_name
df_matrix$gene_name <- NULL

# Dissimilarity matrix
d <- dist(df_matrix, method = "euclidean")

# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "ward.D2" )

# Plot dendrogram
#plot(hc1, cex = 0.6, hang = -1)
 
#head(df_test)
 
df_test$gene_name <- factor(df_test$gene_name, levels = unique(df_test$gene_name)[hc1$order])   
df_test <- arrange(df_test, gene_name) 

### Angle of labels
step <- 360/length(unique(df_test$gene_name))
myAng <- seq(0, 360, step)
myAng <- rev(myAng)
myAng <- myAng - 90
myAng <- ifelse(myAng > 90 & myAng < 270, myAng-180, myAng)

#Adding FDR
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E12.5", "E12.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E14.5", "E14.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E17.5", "E17.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E19.5", "E19.5_Down_FDR", df_test$DPC_Sig_dir)

df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E12.5", "E12.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E14.5", "E14.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E17.5", "E17.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E19.5", "E19.5_Up_FDR", df_test$DPC_Sig_dir)



df_test$Sig_dir <- ifelse(df_test$PValue < 0.05 & df_test$logFC == -1, "P_Down", "Non-Sig")
df_test$Sig_dir <- ifelse(df_test$PValue < 0.05 & df_test$logFC == 1, "P_Up", df_test$Sig_dir)
df_test$Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1, "P_Down", df_test$Sig_dir)
df_test$Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1, "P_Up", df_test$Sig_dir)


##### 

Fig1g <- ggplot(df_test, aes(x=gene_name, y = abs(logFC), fill=DPC_Sig_dir))+
  geom_bar(stat = "identity", alpha = 1, position = "stack")+
  annotate("rect", xmin=0, xmax=Inf, ymin=-1, ymax=0, alpha=1, fill="white")+
  annotate("rect", xmin=0, xmax=Inf, ymin=0, ymax=1, alpha=0.5, fill="grey10")+
  annotate("rect", xmin=0, xmax=Inf, ymin=1, ymax=2, alpha=0.5, fill="grey30")+
  annotate("rect", xmin=0, xmax=Inf, ymin=2, ymax=3, alpha=0.5, fill="grey60")+
  annotate("rect", xmin=0, xmax=Inf, ymin=3, ymax=4, alpha=0.5, fill="grey90")+
  geom_bar(stat = "identity", alpha = 1, position = "stack")+
  coord_polar()+
  theme_minimal()+
  geom_hline(yintercept = 0, size=1, linetype=1)+
  geom_hline(yintercept = 1, size=1, linetype=1)+
  geom_hline(yintercept = 2, size=1, linetype=1)+
  geom_hline(yintercept = 3, size=1, linetype=1)+
  geom_hline(yintercept = 4, size=1, linetype=1)+
  theme(axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())+
  labs(x="", y="")+
  theme(legend.position = "none")+
  scale_fill_manual(values = c(
    "E12.5_DE_Up"=  "#eb8889",
    "E14.5_DE_Up" = "#eb8889",
    "E17.5_DE_Up" = "#eb8889",
    "E19.5_DE_Up" = "#eb8889",
    "E12.5_Non-DE_Up"=  NA,
    "E14.5_Non-DE_Up" = NA,
    "E17.5_Non-DE_Up" = NA,
    "E19.5_Non-DE_Up" = NA,
    "E12.5_DE_Down"=  "#83aecc",
    "E14.5_DE_Down" = "#83aecc",
    "E17.5_DE_Down" = "#83aecc",
    "E19.5_DE_Down" = "#83aecc",
    "E12.5_Non-DE_Down"=  NA,
    "E14.5_Non-DE_Down" = NA,
    "E17.5_Non-DE_Down" = NA,
    "E19.5_Non-DE_Down" = NA,
    ############################
    "E12.5_Down_FDR" = my_col[2],
    "E14.5_Down_FDR" = my_col[2],
    "E17.5_Down_FDR" = my_col[2],
    "E19.5_Down_FDR" = my_col[2],
    
    "E12.5_Up_FDR" = my_col[1],
    "E14.5_Up_FDR" = my_col[1],
    "E17.5_Up_FDR" = my_col[1],
    "E19.5_Up_FDR" = my_col[1]

  ))+
  ylim(-1,4)+
  theme(axis.text.x = element_text(angle = myAng, size =13, color="black"))+
  theme(panel.border = element_blank(),
        legend.key = element_blank(),
        axis.ticks = element_blank(),
        axis.text.y = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank())+
  annotate(geom = "text", x = 72, y = 0.5, label = "P0", color = "black",
             angle = 50, size =10)+
  annotate(geom = "text", x = 72, y = 1.46, label = "E17.5", color = "black",
             angle = 50, size =10)+
  annotate(geom = "text", x = 72, y = 2.47, label = "E14.5", color = "black",
             angle = 50, size =10)+
 annotate(geom = "text", x = 72, y = 3.47, label = "E12.5", color = "black",
             angle = 50, size =10)

Fig1g
**Figure 1g.** Intersection of stage-specific DE genes with the 82 SFARI autism-associated mouse gene orthologs that were expressed at measurable levels in the RNA-seq data. E17.5 DE genes (FDR < 0.05) were enriched for SFARI genes (P = 3.9e-04, hypergeometric test). Concentric circles represent developmental time-points. Light red, upregulated DE genes (P < 0.05); dark red, upregulated DE genes (FDR < 0.05); light blue, downregulated DE genes (P < 0.05); dark blue, downregulated DE genes (FDR < 0.05).

Figure 1g. Intersection of stage-specific DE genes with the 82 SFARI autism-associated mouse gene orthologs that were expressed at measurable levels in the RNA-seq data. E17.5 DE genes (FDR < 0.05) were enriched for SFARI genes (P = 3.9e-04, hypergeometric test). Concentric circles represent developmental time-points. Light red, upregulated DE genes (P < 0.05); dark red, upregulated DE genes (FDR < 0.05); light blue, downregulated DE genes (P < 0.05); dark blue, downregulated DE genes (FDR < 0.05).

5.1 SFARI genes enrichment statistical tests

5.1.1 Permutation test

##### Permutation test per GOmpers et al. #####
# Code adopted from Alex's analysis

#All genes in the background

#Enrichment fo SFARI genes among DE genes.
permutation_test <-  function(x, plot.name){
# x <- E17.5_GLM_module_SFARI_cat
  
binary.score.reference <- ifelse(x$gene_name %in% filter(x, SFARI_category %in% c(1,2))$gene_name, 1,0)

binary.score.test <- ifelse(filter(x, PValue < 0.05)$gene_name %in% filter(x, PValue <0.05, SFARI_category %in% c(1,2))$gene_name, 1,0)


geneset.perm.test <- function(binary.score.reference, binary.score.test, iterations, plot.name) {
  
  set.seed(1234)
  
  binary.score.reference <- ifelse(is.na(binary.score.reference),0,binary.score.reference)
  binary.score.test <- ifelse(is.na(binary.score.test),0,binary.score.test)
  count.criteria <- vector(length=iterations)
  test.size <- length(binary.score.test)
  for (index in 1:iterations) {
    count.criteria[index] <- sum(sample(binary.score.reference, test.size, replace = F))
  }
  x.min <- min(c(count.criteria, sum(binary.score.test))) - 20
  if (x.min < 0) {x.min <- 0}
  x.max<- max(c(count.criteria, sum(binary.score.test))) + 20
  z <- abs(mean(count.criteria) - sum(binary.score.test))/sd(count.criteria)
  p <- 2*pnorm(-abs(z))
  e <- sum(binary.score.test)/mean(count.criteria)
  prop <- sum(binary.score.test)/length(binary.score.test)
  bg <- length(binary.score.reference)
  
  hist(count.criteria, col="gray", xlab="Count", ylab="Frequency", 
       
       main=paste(plot.name, " \ncount=", sum(binary.score.test), " of ",  length(binary.score.test), "; p-value=", format(p, scientific = T, digits = 2), sep=""),
       
       xlim=c(x.min,x.max), cex=.5)
  
 # Original title with more parameters
#main=paste(plot.name, " \ncount=", sum(binary.score.test), " of ",  #length(binary.score.test), "; p-value=", format(p, scientific = T, digits = #2), "; FE=", format(e, scientific = F, digits = 2), "; Prop=", format(prop, #scientific = F, digits = 2), "; n(bg)=", bg, sep=""),
  
  
  abline(v=sum(binary.score.test), lwd=3, col="red")
  #return(list(count.criteria,z,p,e,prop,bg,sum(binary.score.test)))
}

geneset.perm.test(binary.score.reference, binary.score.test, 100000, plot.name)

}


par(mfrow=c(2,2), mai=c(0.9,0.9,0.9,0.9))

permutation_test(E12.5_GLM_SFARI_cat, "E12.5")
permutation_test(E14.5_GLM_SFARI_cat, "E14.5")
permutation_test(E17.5_GLM_SFARI_cat, "E17.5")
permutation_test(E19.5_GLM_SFARI_cat, "P0")
Enrichment of autism-associated genes in our DE gene set, based on randomly sampled gene sets (bars) vs. observed number (red line).

Enrichment of autism-associated genes in our DE gene set, based on randomly sampled gene sets (bars) vs. observed number (red line).

5.1.2 P value table

P values presented in the manuscript may be slightly different due to unspecified seed for the pseudorandom number generator during project development. These differences are numerically negligible, and do not have any impact on any conclusions of this work.

#Hypergeometric test
SFARI_genes_hypergeometric_test <- function(x){ 
#Anotations on the right refer to the post in the link

#https://stats.stackexchange.com/questions/16247/calculating-the-probability-of-gene-list-overlap-between-an-rna-seq-and-a-chip-c
#x <- E17.5_GLM_module_SFARI_cat
  
  set.seed(1234)
  
  n <- length(as.character(x$gene_name))   #Total gene population 15k
  a <- length(filter(x, PValue < 0.05)$gene_name)  #RNA Seq enriched # 3000
  b <- length(filter(x, SFARI_category %in% c(1,2))$gene_name) #Enriched by Chip-Seq 400
  t <- length(filter(x, PValue < 0.05, SFARI_category %in% c(1,2))$gene_name) #Intersected 100

hypergeometric_test_p_value <- sum(dhyper(t:b, a, n - a, b))
hypergeometric_test_p_value
}

hyper_E12.5 <- SFARI_genes_hypergeometric_test(E12.5_GLM_SFARI_cat)
hyper_E14.5 <- SFARI_genes_hypergeometric_test(E14.5_GLM_SFARI_cat)
hyper_E17.5 <- SFARI_genes_hypergeometric_test(E17.5_GLM_SFARI_cat)
hyper_P0 <- SFARI_genes_hypergeometric_test(E19.5_GLM_SFARI_cat)

df_p_values <-  data.frame("Age" = c("E12.5", "E14.5", "E17.5", "P0"), 
           "Permutation test P" = c("0.54", "0.87", "7.6e-06", "0.23"), 
           "Hypergeometric test P" = c(
             format(hyper_E12.5, scientific = F, digits = 2),
             format(hyper_E14.5, scientific = F, digits = 2),
             format(hyper_E17.5, scientific = T, digits = 2),
             format(hyper_P0, scientific = F, digits = 2)
                                       ))

colnames(df_p_values) <- c("Age", "Permutation test P", "Hypergeometric test P")

#df_p_values

knitr::kable(df_p_values,
           format = "html", table.attr = "style='width:40%;'", align = "l", position = "left") %>% 
  kableExtra::kable_styling(position = "left")
Age Permutation test P Hypergeometric test P
E12.5 0.54 0.33
E14.5 0.87 0.63
E17.5 7.6e-06 4.5e-06
P0 0.23 0.93

6. Individual gene expression trajectories

###
# x is a gene name
# y is a plot title, in case there is an alternative gene name

rpkm_box_plot <- function(x, y){
rpkm_test <- as.data.frame(reshape2::melt(rpkm.data_linear_all[x,]))
rpkm_test_w_info <- cbind(rpkm_test, sample.info[,"ExperimentalDesign"])
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"DPC"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"Condition"]))

colnames(rpkm_test_w_info) <- c("RPKM", "treatment", "DPC", "Condition")

j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]

ggplot(rpkm_test_w_info, aes(x = DPC, y= RPKM, colour=Condition))+
  geom_smooth(formula = y ~ x, method = "loess", se=T, aes(fill=Condition,  group=Condition), size  = 0.7, alpha=0)+
  geom_boxplot(alpha=0, position="identity", size = 0.2)+
  geom_jitter(size=2, pch = 19, width = 0.2, alpha = 0.5)+
  theme_bw()+
  theme(axis.text.x=element_text(angle=50, vjust=0.9, hjust=1, size=14))+
  theme(axis.text.y=element_text(size=12))+
  theme(axis.title.y=element_text(size=14))+
  labs(title= y, x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  scale_color_manual(values = j_brew_colors)+
  scale_x_discrete(breaks =c("12.5", "14.5", "17.5", "19.5"), labels = c("E12.5", "E14.5", "E17.5", "P0"))+
  theme(legend.position = "none",
  panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  theme(legend.position = "none")
}

6.1 Fig. 3d

genes <- c("Vegfa", "Flt1", "Pdk1", "Ldha", "Bnip3", "Il20rb") 

pl <- lapply(genes, function(x) rpkm_box_plot(x,x))


marrangeGrob(pl, nrow=2, ncol=3, top = "", 
             layout_matrix = rbind(c(1,2,3), c(4,5,6)))
**Figure 3d.** RPKM expression plots of genes that are associated with the network of acute DE at E12.5.

Figure 3d. RPKM expression plots of genes that are associated with the network of acute DE at E12.5.

6.2 Fig. 4a

genes <- c("Mki67", "Eomes", "Pax6", "Sox9", "Tbr1", "Bcl11b", "Satb2", "Cux1")
gene_names <- c("Mki67 (Ki67)", "Eomes (Tbr2)", "Pax6", "Sox9", "Tbr1", "Bcl11b (Ctip2)", "Satb2", "Cux1")

df_genes <- data.frame(genes, gene_names)

pl <- lapply(1:nrow(df_genes), function(x) rpkm_box_plot(df_genes[x, 1], df_genes[x, 2]))


marrangeGrob(pl, nrow=4, ncol=2, top = "", 
             layout_matrix = rbind(c(1,2), c(3,4), c(5,6), c(7,8)))
**Figure 4a.** RPKM trajectories of proliferative and cortical lamination markers that were DE and validated at protein level.

Figure 4a. RPKM trajectories of proliferative and cortical lamination markers that were DE and validated at protein level.

6.3 Fig. 6q

genes <- c("Gfap", "Olig2", "Dlx2", "Gad1") 

pl <- lapply(genes, function(x) rpkm_box_plot(x,x))

marrangeGrob(pl, nrow=2, ncol=3, top = "", 
             layout_matrix = rbind(c(1,2), c(3,4)))
**Figure 6q.** RPKM trajectories of the cell identity markers tested in Figure 6a-k across the developmental time-points of the study show parallel differences in mRNA expression of cell-type markers following MIA induction. *Gfap* and *Olig2* mRNA was significantly upregulated in MIA samples at E17.5. *Dlx2* was significantly downregulated at E17.5, with no significant changes observed for Gad1 (encodes GAD67) or for *Pdgfra* (not shown).

Figure 6q. RPKM trajectories of the cell identity markers tested in Figure 6a-k across the developmental time-points of the study show parallel differences in mRNA expression of cell-type markers following MIA induction. Gfap and Olig2 mRNA was significantly upregulated in MIA samples at E17.5. Dlx2 was significantly downregulated at E17.5, with no significant changes observed for Gad1 (encodes GAD67) or for Pdgfra (not shown).

6.4 Fig. S7d-g

genes <- c("Mki67", "Eomes", "Pax6", "Sox9")
gene_names <- c("Mki67 (Ki67)", "Eomes (Tbr2)", "Pax6", "Sox9")

df_genes <- data.frame(genes, gene_names)

pl <- lapply(1:nrow(df_genes), function(x) rpkm_box_plot(df_genes[x, 1], df_genes[x, 2]))


marrangeGrob(pl, nrow=2, ncol=2, top = "", 
             layout_matrix = rbind(c(1,2), c(3,4)))
**Supplementary Figure 7d-g.** RNA-seq RPKM expression trajectories throughout development of Ki67, Tbr2, Pax6, and Sox9.

Supplementary Figure 7d-g. RNA-seq RPKM expression trajectories throughout development of Ki67, Tbr2, Pax6, and Sox9.

7. Sex specific DE

### Male vs Female DE analysis without lane covariate

single_timepoint_glm_sex <- function(x, y){
#x = "12.5"
#y = "M"

control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")
  
control.datapoints <- intersect(control.datapoints, which(dpc == x & sex.by.rna == y))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x & sex.by.rna == y))

use.cols <- c(control.datapoints, polyic.datapoints)

test.sex.by.rna <- sex.by.rna[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]
test.group <- group[use.cols]

#test.group

min.cpm <- 0.1

design <- model.matrix(~as.numeric(test.group))
y <- DGEList(counts=test.data, group = as.numeric(test.group))
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group 
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design) 
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.

glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table

glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]

glm.output.full

}

E12.5_GLM_male <- single_timepoint_glm_sex(12.5, "M")
E14.5_GLM_male <- single_timepoint_glm_sex(14.5, "M")
E17.5_GLM_male <- single_timepoint_glm_sex(17.5, "M")
E19.5_GLM_male <- single_timepoint_glm_sex(19.5, "M")

E12.5_GLM_female <- single_timepoint_glm_sex(12.5, "F")
E14.5_GLM_female <- single_timepoint_glm_sex(14.5, "F")
E17.5_GLM_female <- single_timepoint_glm_sex(17.5, "F")
E19.5_GLM_female <- single_timepoint_glm_sex(19.5, "F")



####################################################################################################

DE_genes_barplot_2 <- function(a,b,c,d, title){

E12.5_up_n <- nrow(filter(a, PValue < 0.05, logFC > 0))
E12.5_down_n <- nrow(filter(a, PValue < 0.05, logFC < 0))

E14.5_up_n <- nrow(filter(b, PValue < 0.05, logFC > 0))
E14.5_down_n <- nrow(filter(b, PValue < 0.05, logFC < 0))

E17.5_up_n <- nrow(filter(c, PValue < 0.05, logFC > 0))
E17.5_down_n <- nrow(filter(c, PValue < 0.05, logFC < 0))

E19.5_up_n <- nrow(filter(d, PValue < 0.05, logFC > 0))
E19.5_down_n <- nrow(filter(d, PValue < 0.05, logFC < 0))

###

E12.5_up_n_FDR <- nrow(filter(a, FDR < 0.05, logFC > 0))
E12.5_down_n_FDR <- nrow(filter(a, FDR < 0.05, logFC < 0))

E14.5_up_n_FDR <- nrow(filter(b, FDR < 0.05, logFC > 0))
E14.5_down_n_FDR <- nrow(filter(b, FDR < 0.05, logFC < 0))

E17.5_up_n_FDR <- nrow(filter(c, FDR < 0.05, logFC > 0))
E17.5_down_n_FDR <- nrow(filter(c, FDR < 0.05, logFC < 0))

E19.5_up_n_FDR <- nrow(filter(d, FDR < 0.05, logFC > 0))
E19.5_down_n_FDR <- nrow(filter(d, FDR < 0.05, logFC < 0))

DE_df_n <- t(data.frame("E12.5" =c(E12.5_up_n, E12.5_down_n, E12.5_up_n_FDR, E12.5_down_n_FDR),
           "E14.5" =c(E14.5_up_n, E14.5_down_n, E14.5_up_n_FDR, E14.5_down_n_FDR),
           "E17.5" =c(E17.5_up_n, E17.5_down_n, E17.5_up_n_FDR, E17.5_down_n_FDR),
           "P0" =c(E19.5_up_n, E19.5_down_n, E19.5_up_n_FDR, E19.5_down_n_FDR)))

colnames(DE_df_n) <- c("Upregulated", "Downregulated", "Upregulated", "Downregulated")


DE_df_n_melted <- reshape2::melt(DE_df_n)

DE_df_n_melted$stat <- c(rep(", P < 0.05", 8), rep(", FDR < 0.05", 8))

DE_df_n_melted$Var2_stat <- paste(DE_df_n_melted$Var2, DE_df_n_melted$stat)

ggplot(DE_df_n_melted, aes(fill=Var2_stat, group=Var2, x=Var1, y=value))+
  geom_bar(position = "dodge",  stat="identity", color="black")+
   theme_bw()+
  #scale_fill_manual(values=brewer.pal(n = 8, name = "Paired")[c(6,6,2,2 )])+
  scale_fill_manual(values=c("#1F78B4", "#62a0ca", "#E31A1C", "#eb5e60"))+
  theme(legend.title=element_blank())+
  
  theme(plot.title = element_text(size = rel(1.4), hjust=0.5))+
  geom_text(aes(label=value), position=position_dodge(width=0.9), hjust=-0.2)+
  coord_flip()+
  scale_x_discrete(limits = rev(levels(DE_df_n_melted$Var1)))+
  labs(title= title, x="", y="")+
  theme(panel.border = element_blank(),
        legend.position = "bottom",
        axis.ticks = element_blank(),
        axis.text.y = element_text(angle = 90, size = 14, face = "bold", 
                                   hjust = 0.5, vjust = -4),
        axis.text.x = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
  


}

DE_males <- DE_genes_barplot_2(E12.5_GLM_male, 
                 E14.5_GLM_male, 
                 E17.5_GLM_male, 
                 E19.5_GLM_male, "Male DE")

DE_females <- DE_genes_barplot_2(E12.5_GLM_female, 
                 E14.5_GLM_female, 
                 E17.5_GLM_female,
                 E19.5_GLM_female, "Female DE")


grid.arrange(DE_males, DE_females, ncol = 2)
**Supplementary Figure 2cd.** Males and females show largely concordant DE signatures following MIA. Numbers of upregulated and downregulated DE genes with P < 0.05 and FDR < 0.05, for the DE analysis performed on males and females demonstrate robust DE at E17.5 in both sexes.

Supplementary Figure 2cd. Males and females show largely concordant DE signatures following MIA. Numbers of upregulated and downregulated DE genes with P < 0.05 and FDR < 0.05, for the DE analysis performed on males and females demonstrate robust DE at E17.5 in both sexes.

8. The effect of sex covariate on the model

single_timepoint_glm_function_no_sex <- function(x){

control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")  
  
control.datapoints <- intersect(control.datapoints, which(dpc == x))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))

use.cols <- c(control.datapoints, polyic.datapoints)

test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]

min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria

design <- model.matrix(~as.factor(test.lane) + as.numeric(test.group))
y <- DGEList(counts=test.data, group=group[use.cols])
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group 
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design) 
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.

glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table

glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]

glm.output.full

}

E12.5_GLM_no_sex <- single_timepoint_glm_function_no_sex(12.5)
E14.5_GLM_no_sex <- single_timepoint_glm_function_no_sex(14.5)
E17.5_GLM_no_sex <- single_timepoint_glm_function_no_sex(17.5)
E19.5_GLM_no_sex <- single_timepoint_glm_function_no_sex(19.5)


DE_model_wo_sex_covariate <- DE_genes_barplot_2(E12.5_GLM_no_sex, 
                 E14.5_GLM_no_sex, 
                 E17.5_GLM_no_sex, 
                 E19.5_GLM_no_sex, "DE model without sex covariate")


###################


single_timepoint_glm_sex_variable <- function(x){

control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")  
  
control.datapoints <- intersect(control.datapoints, which(dpc == x))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))

use.cols <- c(control.datapoints, polyic.datapoints)

test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]

min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria

design <- model.matrix(~as.factor(test.lane) + as.numeric(test.group) + as.factor(ifelse(test.sex.by.rna == "M", 1, 2)))
y <- DGEList(counts=test.data, group = as.factor(test.sex.by.rna))
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group 
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design) 
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.

glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table

glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]

glm.output.full

}

E12.5_GLM_sex_as_test_variable <- single_timepoint_glm_sex_variable(12.5)
E14.5_GLM_sex_as_test_variable <- single_timepoint_glm_sex_variable(14.5)
E17.5_GLM_sex_as_test_variable <- single_timepoint_glm_sex_variable(17.5)
E19.5_GLM_sex_as_test_variable <- single_timepoint_glm_sex_variable(19.5)


DE_model_sex_as_test_variable <- DE_genes_barplot_2(E12.5_GLM_sex_as_test_variable, 
                 E14.5_GLM_sex_as_test_variable, 
                 E17.5_GLM_sex_as_test_variable, 
                 E19.5_GLM_sex_as_test_variable, "DE model with sex \n as the experimental variable. \n Male vs Female")


grid.arrange(DE_model_wo_sex_covariate, DE_model_sex_as_test_variable, ncol = 2)
 **Supplementary Figure 2ab.** Males and females show largely concordant DE signatures following MIA. (left) Numbers of upregulated and downregulated DE genes with P < 0.05 and FDR < 0.05, in the DE analysis lacking the sex covariate, are similar to the DE model including the sex covariate (Figure 2c). (right) Numbers of upregulated and downregulated DE genes with P < 0.05 and FDR < 0.05, in a DE analysis testing for differential expression between sexes identifies few DE genes, suggesting limited sex dimorphism between samples.

Supplementary Figure 2ab. Males and females show largely concordant DE signatures following MIA. (left) Numbers of upregulated and downregulated DE genes with P < 0.05 and FDR < 0.05, in the DE analysis lacking the sex covariate, are similar to the DE model including the sex covariate (Figure 2c). (right) Numbers of upregulated and downregulated DE genes with P < 0.05 and FDR < 0.05, in a DE analysis testing for differential expression between sexes identifies few DE genes, suggesting limited sex dimorphism between samples.

9. Male vs Female DE

9.1 Effect size correlations

male_vs_female_DE <- function(x, y, z, title){

#x <- E17.5_GLM_male
#y <- E17.5_GLM_female
#z <- E17.5_GLM

a <- dplyr::select(x, gene_name, logFC, PValue, FDR)
b <- dplyr::select(y, gene_name, logFC, PValue, FDR)

colnames(a) <- c("gene_name", paste0(colnames(a)[2:4], "_", "male"))
colnames(b) <- c("gene_name", paste0(colnames(b)[2:4], "_", "female"))

df <- merge(a, b, by ="gene_name", all=T)

FDR_geneset <- dplyr::filter(z, FDR < 0.05)$gene_name

df <- dplyr::filter(df, gene_name %in% FDR_geneset)

df$Sig <- ifelse(df$FDR_male < 0.05, "Males", "Only in DE model including both sexes")
df$Sig <- ifelse(df$FDR_female < 0.05, "Females", df$Sig)
df$Sig <- ifelse(df$FDR_female < 0.05 & df$FDR_male < 0.05, "Independently in both sexes", df$Sig)


#Capping at -4 and +4
df$logFC_male <- ifelse(df$logFC_male > 4, 4, df$logFC_male)
df$logFC_male <- ifelse(df$logFC_male < -4, -4, df$logFC_male)
df$logFC_female <- ifelse(df$logFC_female > 4, 4, df$logFC_female)
df$logFC_female <- ifelse(df$logFC_female < -4, -4, df$logFC_female)

ggplot()+
  geom_abline(intercept = 0, slope =1)+
  geom_point(data = filter(df, Sig == "Only in DE model including both sexes"), 
             aes(x = logFC_male, y = logFC_female, color = Sig, label = gene_name), alpha = 0.4,  size = 2)+
  geom_point(data = filter(df, Sig == "Females"), 
             aes(x = logFC_male, y = logFC_female, color = Sig, label = gene_name), alpha = 0.4,  size = 2)+
  geom_point(data = filter(df, Sig == "Males"), 
             aes(x = logFC_male, y = logFC_female, color = Sig, label = gene_name), alpha = 0.4,  size = 2)+
  geom_point(data = filter(df, Sig == "Independently in both sexes"), 
             aes(x = logFC_male, y = logFC_female, color = Sig, label = gene_name), alpha = 0.4,  size = 2)+
  theme_bw()+
 
  coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4))+
  labs(x="log2FC male", y="log2FC female", title = title, color = "Genes passing FDR < 0.05 \n in the DE model including both sexes \n also DE with FDR < 0.05 in:")+
  theme(plot.title = element_text(hjust = 0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()
        )+
  scale_color_manual(values = c("Females" = "#619CFF", 
                                "Males" = "#F8766D", 
                                "Independently in both sexes" = "#0e8231", 
                                "Only in DE model including both sexes" = "grey"),
                     breaks = c("Females", 
                                "Males", 
                                "Independently in both sexes",
                                "Only in DE model including both sexes"))+
  geom_hline(yintercept=0, linetype=2)+
  geom_vline(xintercept=0, linetype=2)

}

E12.5_mf <- male_vs_female_DE(E12.5_GLM_male, E12.5_GLM_female, E12.5_GLM, "E12.5")
E14.5_mf <- male_vs_female_DE(E14.5_GLM_male, E14.5_GLM_female, E14.5_GLM, "E14.5")
E17.5_mf <- male_vs_female_DE(E17.5_GLM_male, E17.5_GLM_female, E17.5_GLM, "E17.5")


grid.arrange(E12.5_mf+theme(legend.position = "none"), 
             E14.5_mf+theme(legend.position = "none"),
             E17.5_mf, 
             ncol = 3, widths = c(1,1,1.8))
**Supplementary Figure 3.** Sex-stratified comparison of DE genes and effect sizes suggest increased DE effect sizes in females. Correlation of the relative effect sizes (log2FC) for genes passing FDR < 0.05 identifies high degree of concordance between MIA responses between sexes. Genes passing FDR threshold in either or both sexes are color coded.

Supplementary Figure 3. Sex-stratified comparison of DE genes and effect sizes suggest increased DE effect sizes in females. Correlation of the relative effect sizes (log2FC) for genes passing FDR < 0.05 identifies high degree of concordance between MIA responses between sexes. Genes passing FDR threshold in either or both sexes are color coded.

# save.image("G:/Shared drives/Nord Lab - Computational Projects/MIA/eLIFE_Clean_code_for_GitHub/temp.RData")

# load("G:/Shared drives/Nord Lab - Computational Projects/MIA/eLIFE_Clean_code_for_GitHub/temp.RData")

9.2 Venn diagram intersections

9.2.1 E14.5

# BiocManager::install("RBGL") - required by Vennerable
# install.packages("Vennerable", repos="http://R-Forge.R-project.org")

# These plots can be further customized with VennThemes, or in the Illustrator...  
library(Vennerable)

# E14.5
DE_model_including_both_sexes <- filter(E14.5_GLM, FDR < 0.05)$gene_name
Female_DE <- filter(E14.5_GLM_female, FDR < 0.05)$gene_name

ll <- list(DE_model_including_both_sexes, Female_DE)
llv <- Venn(ll, SetNames = c("DE_model_including_both_sexes", "Female_DE"))

plot(llv, doWeights = TRUE, type = "circles")

9.2.2 E17.5

library(Vennerable)

DE_model_including_both_sexes <- filter(E17.5_GLM, FDR < 0.05)$gene_name
Male_DE <- filter(E17.5_GLM_male, FDR < 0.05)$gene_name
Female_DE <- filter(E17.5_GLM_female, FDR < 0.05)$gene_name

ll <- list(DE_model_including_both_sexes, Male_DE, Female_DE)
llv <- Venn(ll, SetNames = c("DE_model_including_both_sexes", "Male_DE", "Female_DE"))

#plot(llv, doWeights = FALSE)
#plot(llv, doWeights = TRUE)

# Note. Editing Vennerable plos is very difficult, especially in Rmd framework. Consider switching to ggVennDiagram. 

myplot1 <- function(){plot(llv, doWeights = TRUE)}
myplot2 <- function(){plot(llv, doWeights = FALSE)}

p1 <- grid.grabExpr(myplot1())
p2 <- grid.grabExpr(myplot2())

grid.arrange(p1, p2, ncol=2) 
**Supplementary Figure 3b**. Venn diagrams illustrating overlap between DE gene sets passing FDR < 0.05 at E14.5 and E17.5, in DE models containing samples from both sexes, males, or females. The DE model comprising individuals of both sexes includes most of the DE signature of each sex. Female DE analysis identifies limited and unique DE signature.

Supplementary Figure 3b. Venn diagrams illustrating overlap between DE gene sets passing FDR < 0.05 at E14.5 and E17.5, in DE models containing samples from both sexes, males, or females. The DE model comprising individuals of both sexes includes most of the DE signature of each sex. Female DE analysis identifies limited and unique DE signature.

9.3 Female but not Male DE genes

9.3.1 E14.5

# The order of bars is not exactly the same as in the paper. I decided to omit this due to its complication. 

rpkm_box_plot_sex_single_dpc <- function(x, y){
  # x <- "Vegfa"
  # y <- "14.5"
rpkm_test <- as.data.frame(reshape2::melt(rpkm.data_linear[x,]))
rpkm_test_w_info <- cbind(rpkm_test, sample.info[,"ExperimentalDesign"])
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"DPC"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"Condition"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"sex.by.rna"]))


colnames(rpkm_test_w_info) <- c("RPKM", "treatment", "DPC", "Condition", "Sex")

rpkm_test_w_info$Sex_Condition <- as.character(paste0(rpkm_test_w_info$Sex, "_", rpkm_test_w_info$Condition)) 

rpkm_test_w_info$Sex_Condition_DPC <- as.character(paste0(rpkm_test_w_info$Sex_Condition, "_", rpkm_test_w_info$DPC)) 
rpkm_test_w_info$Sex <- ifelse(rpkm_test_w_info$Sex == "F", "Female", "Male")

j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]

rpkm_test_w_info <- filter(rpkm_test_w_info, DPC == y)

ggplot(rpkm_test_w_info, aes(x = x, y= RPKM, colour=Condition, shape = Sex, group = Sex_Condition_DPC))+
  
  geom_jitter(size=2, alpha = 0.5, position = position_jitterdodge(jitter.width = 0.05))+
  
  geom_boxplot(alpha=0, position = position_dodge(width=0.75))+
  
  theme_bw()+
  theme(axis.text.x=element_text(size=12, face="bold"))+
  labs(title= x, x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(),
        legend.position = "none")+
  scale_color_manual(values = j_brew_colors)
  
  
}



# Gene selection criteria
# FDR < 0.5 and logFC > 0.5 or < -0.5 in females
# and not logFC > 0.5 or < -0.5 in males. 

f <- filter(E14.5_GLM_female, FDR < 0.05, logFC > 0.5 | logFC < -0.5)$gene_name   #238 genes

m <- filter(E14.5_GLM_male, logFC > 0.5 | logFC < -0.5)$gene_name #2593 genes, FDR criteria removed because of 0 genes passing threshold.

f_but_not_m <- setdiff(f, m)

genes <- filter(E14.5_GLM_female, gene_name %in% f_but_not_m)$gene_name

pl <- lapply(genes[1:12], function(x) rpkm_box_plot_sex_single_dpc(x, "14.5"))


marrangeGrob(list(pl[[1]],pl[[2]], pl[[3]], pl[[4]], pl[[5]],pl[[6]],
                  pl[[7]], pl[[8]], pl[[9]], pl[[10]], pl[[11]],
                     pl[[12]]+theme(legend.position = "right")), 
                   nrow=2, ncol=6, top = "", layout_matrix = rbind(c(1:6), c(7:12)))

9.3.2 E17.5

f <- filter(E17.5_GLM_female, FDR < 0.05, logFC > 0.5 | logFC < -0.5)$gene_name   #3037 genes
m <- filter(E17.5_GLM_male, FDR < 0.05, logFC > 0.5 | logFC < -0.5)$gene_name #1048 genes, 

f_but_not_m <- setdiff(f, m)

genes <- filter(E17.5_GLM_female, gene_name %in% f_but_not_m)$gene_name

pl <- lapply(genes[1:12], function(x) rpkm_box_plot_sex_single_dpc(x, "17.5"))


marrangeGrob(list(pl[[1]],pl[[2]], pl[[3]], pl[[4]], pl[[5]],pl[[6]],
                  pl[[7]], pl[[8]], pl[[9]], pl[[10]], pl[[11]],
                     pl[[12]]+theme(legend.position = "right")), 
                   nrow=2, ncol=6, top = "", layout_matrix = rbind(c(1:6), c(7:12)))
**Supplementary Figure 3c.** Examples of E14.5 and E17.5 genes significant in females but not males. At E14.5 genes were selected to pass FDR < 0.05 and logFC > 0.5 or logFC < -0.5 in females, but not logFC > 0.5 or logFC < -0.5 in males. The gene-set selection process was analogous for E17.5 with the addition of FDR < 0.05 condition in males. After gene set filtering, twelve genes with the lowest FDR in females were plotted.

Supplementary Figure 3c. Examples of E14.5 and E17.5 genes significant in females but not males. At E14.5 genes were selected to pass FDR < 0.05 and logFC > 0.5 or logFC < -0.5 in females, but not logFC > 0.5 or logFC < -0.5 in males. The gene-set selection process was analogous for E17.5 with the addition of FDR < 0.05 condition in males. After gene set filtering, twelve genes with the lowest FDR in females were plotted.

9.4 Sex stratifies DE genes

rpkm_box_plot_sex <- function(x){
  #x <- "Vegfa"
rpkm_test <- as.data.frame(reshape2::melt(rpkm.data_linear[x,]))
rpkm_test_w_info <- cbind(rpkm_test, sample.info[,"ExperimentalDesign"])
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"DPC"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"Condition"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"sex.by.rna"]))


colnames(rpkm_test_w_info) <- c("RPKM", "treatment", "DPC", "Condition", "Sex")

rpkm_test_w_info$Sex_Condition <- as.character(paste0(rpkm_test_w_info$Sex_by_RNA, "_", rpkm_test_w_info$Condition)) 

rpkm_test_w_info$Sex_Condition_DPC <- as.character(paste0(rpkm_test_w_info$Sex_Condition, "_", rpkm_test_w_info$DPC)) 

head(rpkm_test_w_info)


j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]

ggplot(rpkm_test_w_info, aes(x = DPC, y= RPKM, colour=Condition, shape = Sex, group = Sex_Condition))+
  geom_jitter(size=2, alpha = 0.5, position = position_jitterdodge(jitter.width = 0.2))+
  geom_boxplot(alpha=0, position = position_dodge(width=0.75), aes(group = Sex_Condition_DPC))+
  theme_bw()+
  theme(axis.text.x=element_text(size=12, face="bold"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  labs(title= x, x="")+
  theme(plot.title = element_text(size = rel(2), hjust=0.5))+
  scale_color_manual(values = j_brew_colors)+
  scale_fill_manual(values = j_brew_colors)+
  scale_x_discrete(breaks =c("12.5", "14.5", "17.5", "19.5"), labels = c("E12.5", "E14.5", "E17.5", "P0"))
  
}

9.4.1

# S6a
x <- c("Vegfa", "Pdk1", "Bnip3", "Flt1", "Ldha",  "Il20rb")
pl <- lapply(x, function(x) FUN={
  rpkm_box_plot_sex(x)+theme(legend.position = "none")
})

marrangeGrob(pl, nrow = 3, ncol=2, top = "")

9.4.2

# S6b
### ###
x <- c("Mki67", "Pax6",  "Tbr1", "Satb2", "Eomes", "Sox9",  "Bcl11b",  "Cux1")
pl <- lapply(x, function(x) FUN={
  rpkm_box_plot_sex(x)+theme(legend.position = "none")
})

marrangeGrob(pl, nrow = 4, ncol=2, top = "")

9.4.3

# S6c
x <- c("Gfap", "Olig2",  "Dlx2", "Gad1")
pl <- lapply(x, function(x) FUN={
  rpkm_box_plot_sex(x)+theme(legend.position = "none")
})

marrangeGrob(pl, nrow = 2, ncol=2, top = "")
**Supplementary Figure 6.** Sex-stratified RPKM gene trajectories of genes presented in the manuscript showing concordant effects between males and females. (9.4.1) Vegfa, Flt1, Pdk1, Ldha, Bnip3, Il20rb. (9.4.2) Mki67, Eomes, Pax6, Sox9, Tbr1, Bcl11b, Satb2, Cux1. (9.4.3) Gfap, Dlx2, Olig2, Gad1.

Supplementary Figure 6. Sex-stratified RPKM gene trajectories of genes presented in the manuscript showing concordant effects between males and females. (9.4.1) Vegfa, Flt1, Pdk1, Ldha, Bnip3, Il20rb. (9.4.2) Mki67, Eomes, Pax6, Sox9, Tbr1, Bcl11b, Satb2, Cux1. (9.4.3) Gfap, Dlx2, Olig2, Gad1.

10. R sessionInfo()

library(pander)

pander(sessionInfo())

R version 4.0.2 (2020-06-22)

Platform: x86_64-w64-mingw32/x64 (64-bit)

locale: LC_COLLATE=English_United States.1252, LC_CTYPE=English_United States.1252, LC_MONETARY=English_United States.1252, LC_NUMERIC=C and LC_TIME=English_United States.1252

attached base packages: grid, stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: pander(v.0.6.3), Vennerable(v.3.0), xtable(v.1.8-4), gtools(v.3.8.2), reshape(v.0.8.8), RBGL(v.1.64.0), graph(v.1.66.0), biomaRt(v.2.44.4), Hmisc(v.4.4-2), Formula(v.1.2-4), survival(v.3.1-12), lattice(v.0.20-41), plotly(v.4.9.2.2), plyr(v.1.8.6), gridExtra(v.2.3), ggrepel(v.0.8.2), knitr(v.1.30), RColorBrewer(v.1.1-2), DT(v.0.16), data.table(v.1.12.8), dplyr(v.1.0.0), reshape2(v.1.4.4), ggplot2(v.3.3.2), mclust(v.5.4.7), GenomicFeatures(v.1.40.1), AnnotationDbi(v.1.50.3), Biobase(v.2.48.0), GenomicRanges(v.1.40.0), GenomeInfoDb(v.1.24.2), IRanges(v.2.22.2), S4Vectors(v.0.26.1), BiocGenerics(v.0.34.0), edgeR(v.3.30.3), limma(v.3.44.3), pheatmap(v.1.0.12), RUVnormalize(v.1.22.0), sva(v.3.36.0), BiocParallel(v.1.22.0), genefilter(v.1.70.0), mgcv(v.1.8-31) and nlme(v.3.1-148)

loaded via a namespace (and not attached): colorspace(v.1.4-1), ellipsis(v.0.3.1), htmlTable(v.2.1.0), XVector(v.0.28.0), base64enc(v.0.1-3), rstudioapi(v.0.13), farver(v.2.0.3), bit64(v.4.0.5), xml2(v.1.3.2), codetools(v.0.2-16), splines(v.4.0.2), jsonlite(v.1.7.2), Cairo(v.1.5-12.2), Rsamtools(v.2.4.0), annotate(v.1.66.0), cluster(v.2.1.0), dbplyr(v.2.0.0), png(v.0.1-7), compiler(v.4.0.2), httr(v.1.4.2), backports(v.1.1.7), assertthat(v.0.2.1), Matrix(v.1.2-18), lazyeval(v.0.2.2), htmltools(v.0.5.0), prettyunits(v.1.1.1), tools(v.4.0.2), gtable(v.0.3.0), glue(v.1.4.1), GenomeInfoDbData(v.1.2.3), rappdirs(v.0.3.1), Rcpp(v.1.0.6), vctrs(v.0.3.0), Biostrings(v.2.56.0), rtracklayer(v.1.48.0), crosstalk(v.1.1.0.1), xfun(v.0.15), stringr(v.1.4.0), rvest(v.0.3.6), lifecycle(v.0.2.0), XML(v.3.99-0.3), zlibbioc(v.1.34.0), scales(v.1.1.1), hms(v.0.5.3), SummarizedExperiment(v.1.18.2), yaml(v.2.2.1), curl(v.4.3), memoise(v.1.1.0), rpart(v.4.1-15), latticeExtra(v.0.6-29), stringi(v.1.4.6), RSQLite(v.2.2.1), highr(v.0.8), checkmate(v.2.0.0), RUVnormalizeData(v.1.8.0), rlang(v.0.4.9), pkgconfig(v.2.0.3), matrixStats(v.0.57.0), bitops(v.1.0-6), evaluate(v.0.14), purrr(v.0.3.4), labeling(v.0.4.2), GenomicAlignments(v.1.24.0), htmlwidgets(v.1.5.3), bit(v.4.0.4), tidyselect(v.1.1.0), magrittr(v.2.0.1), R6(v.2.5.0), generics(v.0.1.0), DelayedArray(v.0.14.1), DBI(v.1.1.0), foreign(v.0.8-80), pillar(v.1.4.7), withr(v.2.3.0), nnet(v.7.3-14), RCurl(v.1.98-1.2), tibble(v.3.0.1), crayon(v.1.3.4), BiocFileCache(v.1.12.1), rmarkdown(v.2.6), jpeg(v.0.1-8.1), progress(v.1.2.2), locfit(v.1.5-9.4), blob(v.1.2.1), webshot(v.0.5.2), digest(v.0.6.25), tidyr(v.1.1.0), openssl(v.1.4.3), munsell(v.0.5.0), kableExtra(v.1.3.1), viridisLite(v.0.3.0) and askpass(v.1.1)

# Thw workspace is saved at save.image("G:/Shared drives/Nord Lab - Computational Projects/MIA/Canales_et_al_eLIFE_DE_repo/Canales_eLIFE_2021_DE/Canales_2021_eLIFE_DE_workspace.RData")